package data_deepprocessing.algorithm.ssvm.ssvm_util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.deeplearning4j.models.word2vec.Word2Vec;

import data_deepprocessing.algorithm.ssvm.bean.WordMarkBean;
import data_deepprocessing.algorithm.word_embedding.service.YYH_Word2Vector_EN_Service;
import data_deepprocessing.util.FileUtil;

/** 
* @author  作者 : YUHU YUAN
* @date 创建时间：2017年4月10日 下午8:06:30 
* @version 1.0  
*/

public class CRFplusCrossValidService2EN {
	
	
	
	private Word2Vec doGetWord2Vec(){
		
		try {
			return YYH_Word2Vector_EN_Service.getWord2Vec();
		} catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		return null;
	}
	
	
	
	
	/** 
	* @author  作者 : YUHU YUAN
	* @date 创建时间：2017年4月9日 下午10:37:03 
	* @parameter 
	* @return
	* @throws
	*/
	private void doGenerateCRFDataSet2Train(String outputPath, String inputPath,Word2Vec word2Vec) throws IOException{
		BufferedWriter writer = FileUtil.getWriter(outputPath);
		File[] crfFiles = new File(inputPath).listFiles();
		List<WordMarkBean> wordMarkBeans = null;
		BufferedReader reader = null;
		for (File file : crfFiles) {
			reader = FileUtil.getReader(file);
			wordMarkBeans = new ArrayList<>();
			String line = "";
			while ((line = reader.readLine()) != null) {
				String[] temp = line.split("\t");
				WordMarkBean bean = new WordMarkBean(temp[0], temp[1]);
				wordMarkBeans.add(bean);
			}

			StringBuffer constructTerm = new StringBuffer();
			for (WordMarkBean bean : wordMarkBeans) {
				String word = bean.getWord().toLowerCase();
				double[] voctor = word2Vec.getWordVector(word);
				String mark = bean.getMark();
				String tag = "";
				if (mark.equals("O")) {
					tag = "1";
				} else if (mark.equals("B-S")) {
					tag = "2";
				} else if (mark.equals("E-S")) {
					tag = "3";
				}
				constructTerm.append(tag).append("\t");
				if (voctor != null) {
					for (int i = 0; i < voctor.length; ++i) {
						constructTerm.append((i + 1) + ":" + voctor[i]).append("\t");
					}
				} else {
					for (int i = 0; i < 200; ++i) {
						constructTerm.append((i + 1) + ":" + 1).append("\t");
					}
				}
				writer.write(constructTerm.toString());
				writer.flush();
				writer.newLine();
				constructTerm.setLength(0);
			}
			writer.newLine();
		}
		if (reader != null) {
			reader.close();
		}
		if (writer != null) {
			writer.close();
		}
		
	}
	
	
	
	private static String path = "D:\\yyh_yuanyuhu_graduation_experimental\\word_embedding_EN_CRF++\\dataset\\";
	
	
	public void doGenerateCRFCrossValidDataSet(){
		Word2Vec word2Vec = doGetWord2Vec();
		
		for(int i=0 ; i<10; ++i){
			StringBuffer testPath = new StringBuffer(path).append("data")
					.append(i).append(File.separator).append("test")
					.append(File.separator).append("test.crfsuite.txt");
			StringBuffer crfTestPath = new StringBuffer(path).append("dataCRF").append(i).append(File.separator).append("test");
			try {
				doGenerateCRFDataSet2Train(testPath.toString(),crfTestPath.toString(), word2Vec);
			} catch (IOException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			}
			StringBuffer trainPath = new StringBuffer(path).append("data")
					.append(i).append(File.separator).append("train")
					.append(File.separator).append("train.crfsuite.dat");
			StringBuffer crfTrainPath = new StringBuffer(path).append("dataCRF").append(i).append(File.separator).append("train");
			try {
				doGenerateCRFDataSet2Train(trainPath.toString(),crfTrainPath.toString(), word2Vec);
			} catch (IOException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			}
			testPath.setLength(0);
			trainPath.setLength(0);
		}
		
	}
	
	
}








